library(readr)


# 1. Open files ----

# PSD-side table. Further down it is filtered on `cus_id` and joined to `cra`
# with data.table `on =` syntax, so the .rds is presumably a data.table — confirm.
# NOTE(review): `PSD-DATA-FOLDER` is not a single R identifier (R parses it as
# PSD - DATA - FOLDER); it looks like a redacted path placeholder — replace with
# the real folder variable before running.
dt <- readRDS(paste0(PSD-DATA-FOLDER, "htb_psd.rds"))

# CRA-side table, filtered on `cra_id` / `account_id` and modified with `:=`
# further down. The same placeholder caveat applies to `CRA-DATA-FOLDER`.
# NOTE(review): paste0() adds no separator, so the folder value must already end
# with a path separator — confirm.
# NOTE(review): this script calls data.table and glue functions but only attaches
# readr; presumably those packages are loaded by the caller — confirm.
cra <- readRDS(paste0(CRA-DATA-FOLDER, "htb_cra.rds"))


# 2. Match ----

#' Run one matching round between the CRA and PSD tables
#'
#' Inner-joins the global `cra` table to the global `dt` table on `yob` plus
#' the supplied key columns, leaving out every record that was matched in an
#' earlier round. Two passes are made: the first joins `cra$yob` to
#' `dt$yob_main`; the second joins what is still unmatched to `dt$yob_second`
#' (the second borrower).
#'
#' @param conditions Named character vector of additional join keys. Names are
#'   columns of `cra`, values are the corresponding columns of `dt`.
#' @param start_matched_dset data.table with columns `cus_id`, `cra_id` and
#'   `account_id` holding the matches found so far.
#' @return `start_matched_dset` with the newly found matches appended (columns
#'   `cus_id`, `cra_id`, `account_id`).
match_round <- function(conditions, start_matched_dset) {

  all_matched <- start_matched_dset

  for (yob_col in c("yob_main", "yob_second")) {

    # Only records not matched so far take part in the join. The original code
    # looked up `$acount_id` (typo), which is NULL, so the account filter never
    # excluded anything. Note that `%in%` treats NA as a value: an all-NA seed
    # row in the matched set also drops records whose id is missing.
    cra_pool <- cra[!(cra_id %in% all_matched$cra_id) &
                      !(account_id %in% all_matched$account_id)]
    psd_pool <- dt[!(cus_id %in% all_matched$cus_id)]

    tdt <- cra_pool[psd_pool,
                    on = c("yob" = yob_col, conditions),
                    nomatch = 0]

    message(paste0("New matches with ", yob_col, ": ", tdt[, .N]))

    # Matches from this pass are added before the next pass so that the second
    # borrower pass cannot re-use them.
    all_matched <- rbind(all_matched,
                         tdt[, .(cus_id, cra_id, account_id)],
                         use.names = TRUE)
  }

  # result
  # Rows with a missing cus_id (e.g. an NA seed row) are placeholders, not
  # matches, so they are left out of the reported totals.
  n_matched <- sum(!is.na(all_matched$cus_id))

  message(paste0("Tot matched custumers: ", n_matched))
  message(paste0("This is ", round(n_matched / nrow(dt) * 100, 1), "% of custumers"))

  all_matched

}

# Apply the matching rounds in order. Each round only considers records that
# earlier rounds left unmatched, so the order of the key sets matters.

# All-NA seed row: gives the first round a table with the expected columns.
start_matching <- data.table(cus_id = NA, cra_id = NA, account_id = NA)

# Exact rounds: names are `cra` columns, values are `dt` columns.
exact_rounds <- list(
  c("lsoa_post_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal" = "loan_value"),
  c("lsoa_post_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal_adj" = "loan_value"),
  c("lsoa_pre_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal" = "loan_value")
)

all_matched <- start_matching

for (conds in exact_rounds) {
  all_matched <- match_round(conditions = conds, all_matched)
}


# now less good matches on loan size: balances rounded to the nearest 1000

cra[, open_bal_fuzz := round(open_bal, -3)]
dt[, loan_value_fuzz := round(loan_value, -3)]
cra[, open_bal_adj_fuzz := round(open_bal_adj, -3)]

fuzzy_rounds <- list(
  c("lsoa_post_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal_fuzz" = "loan_value_fuzz"),
  c("lsoa_pre_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal_fuzz" = "loan_value_fuzz"),
  c("lsoa_post_mtg" = "lsoa_id",
    "frn" = "firm_frn",
    "open_date" = "origination_date",
    "open_bal_adj_fuzz" = "loan_value_fuzz")
)

for (conds in fuzzy_rounds) {
  all_matched <- match_round(conditions = conds, all_matched)
}

# save

# Drop the NA seed row. Negative indexing also works when no round matched
# anything; `2:nrow(all_matched)` would count down as c(2, 1) in that case and
# return NA rows instead of an empty table.
op1 <- all_matched[-1L]

# For every match, pull in the `dt` columns of the matched customer.
di <- dt[op1, on = "cus_id"]

write_rds(di, paste0(DATADIR, "htb_cus_id_matched_cra.rds"))


# 3. Keep only the matched IDs, and match again on current account data ----

#' Load one CATO part and keep only the matched CRA ids
#'
#' Reads `part_{part}/cato.rds` and keeps the rows whose `cra_id` appears in
#' the global `di` table.
#'
#' @param part Part identifier interpolated into the file path.
#' @return The filtered table, returned invisibly.
grab_cato <- function(part) {

  cato_path <- glue("{CRA-DATA-FOLDER}/part_{part}/cato.rds")
  cato_part <- read_rds(cato_path)

  invisible(cato_part[cra_id %in% di[, cra_id]])

}

# Run grab_cato() for parts 1 to 10 and stack the results into one table.
cato <- rbindlist(lapply(seq_len(10), grab_cato))

# calculate match rate: share of distinct matched cra_id values found in CATO.
# The value is printed explicitly because a bare top-level expression is not
# auto-printed when the script is run through source() without echo.
message("proportion of cra_id in 2019 CRA CATO:")
print(round(cato[, uniqueN(cra_id)] / di[, uniqueN(cra_id)], 2))

# save
saveRDS(cato, paste0(DATADIR, "htb_cra-cato-matched.rds"))
